{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pyYnq_d3jX9y"
      },
      "source": [
        "## Loading the dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "oAKgqSfg4Av_",
        "outputId": "2efaaeba-9191-4899-cc23-a30daead997d"
      },
      "outputs": [],
      "source": [
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install --upgrade gdown"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "o_YiySaL5JXh",
        "outputId": "de9b84e1-f860-42a1-f5c2-33d3c5f8d7c0"
      },
      "outputs": [],
      "source": [
        "# Download the dataset from Google Drive by file ID. The ID is passed\n",
        "# positionally because gdown has deprecated the `--id` flag.\n",
        "!gdown 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "95RkXV8bgVAV",
        "outputId": "e1db356b-e9e0-4e61-fe7a-4ed19f740637"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from sklearn.model_selection import train_test_split\n",
        "from imblearn.over_sampling import SMOTE\n",
        "from sklearn.utils import resample\n",
        "import pandas as pd\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import roc_auc_score, classification_report\n",
        "from sklearn.metrics import roc_auc_score\n",
        "from sklearn.metrics import confusion_matrix\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "import statsmodels.api as sm\n",
        "import seaborn as sns\n",
        "from sklearn.preprocessing import OrdinalEncoder"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 427
        },
        "id": "LbJgEpuFsXo8",
        "outputId": "55c9df3b-26bc-4fda-90db-14a88914f4cf"
      },
      "outputs": [],
      "source": [
        "# Read the CSV into a DataFrame and preview the first 10 rows\n",
        "data = pd.read_csv(\"data_regression.csv\")\n",
        "data.head(n=10)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wc6EPUEBWvIq"
      },
      "outputs": [],
      "source": [
        "# check for the missing values and dataframes\n",
        "def datainspection(dataframe):\n",
        "  \"\"\"Print dtype and missing-value diagnostics for `dataframe` and plot its null map.\n",
        "\n",
        "  Args:\n",
        "    dataframe: pandas DataFrame to inspect.\n",
        "\n",
        "  Prints the dtype of every column, the number of rows that contain at least\n",
        "  one null, the null count per column, and draws a seaborn heatmap of the\n",
        "  null mask. Returns nothing.\n",
        "  \"\"\"\n",
        "  print(\"Types of the variables we are working with:\")\n",
        "  print(dataframe.dtypes)\n",
        "\n",
        "  # Every statistic is derived from the argument, not from a notebook-level\n",
        "  # variable, so the function reports on whichever frame it is given.\n",
        "  null_mask = dataframe.isnull()\n",
        "\n",
        "  print(\"Total Samples with missing values:\")\n",
        "  print(null_mask.any(axis=1).sum()) # rows with at least one null\n",
        "\n",
        "  print(\"Total Missing Values per Variable\")\n",
        "  print(null_mask.sum())\n",
        "  print(\"Map of missing values\")\n",
        "  sns.heatmap(null_mask)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "aB6NkvMPX9Fx",
        "outputId": "ad7a6b57-1cfb-4e30-9288-83191e943c59"
      },
      "outputs": [],
      "source": [
        "# Report dtypes and missing values of the loaded dataset\n",
        "datainspection(dataframe=data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kQVFmR-pjiJj"
      },
      "outputs": [],
      "source": [
        "# cleaning up null values: drop every row that has at least one missing entry\n",
        "data = data.dropna(axis=0, how='any')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 270
        },
        "id": "bRdAmm8yvWJJ",
        "outputId": "071d8e5e-58be-4e3e-f6c8-136bd932dfd7"
      },
      "outputs": [],
      "source": [
        "# function for encoding categorical variables\n",
        "def encode_cat(data, vars):\n",
        "  \"\"\"Add an ordinal-encoded `<name>_code` column for every column named in `vars`.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame holding the categorical columns.\n",
        "    vars: iterable of column names to encode.\n",
        "\n",
        "  Returns:\n",
        "    A copy of `data` with one extra `<name>_code` column per encoded variable.\n",
        "    The frame passed in is left unchanged.\n",
        "  \"\"\"\n",
        "  # Work on a copy: the caller's frame is not mutated, and assigning columns\n",
        "  # cannot trigger pandas' SettingWithCopyWarning when `data` is a slice.\n",
        "  data = data.copy()\n",
        "  ord_en = OrdinalEncoder()\n",
        "  for v in vars:\n",
        "    name = v+'_code' # add _code for encoded variables\n",
        "    # fit_transform returns a 2-D (n_rows, 1) result; flatten it to one column\n",
        "    data[name] = np.ravel(ord_en.fit_transform(data[[v]]))\n",
        "    print('The encoded values for '+ v + ' are:')\n",
        "    print(data[name].unique())\n",
        "  return data\n",
        "data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 379
        },
        "id": "xGnuojAZzgSG",
        "outputId": "9df3f4b2-d04d-427c-e1c3-cf0febfbcc09"
      },
      "outputs": [],
      "source": [
        "# encode the categorical columns, then check for the encoded variables\n",
        "categorical_vars = ['gender', 'multi_screen', 'mail_subscribed']\n",
        "data = encode_cat(data, categorical_vars)\n",
        "data.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dVcl9m6-XhX2"
      },
      "outputs": [],
      "source": [
        "def full_plot(data, class_col, cols_to_exclude):\n",
        "  \"\"\"Draw a seaborn pairplot of the numeric columns of `data`, coloured by `class_col`.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame to plot.\n",
        "    class_col: name of the column used as the `hue` of the pairplot.\n",
        "    cols_to_exclude: column names to leave out of the plot.\n",
        "  \"\"\"\n",
        "  excluded = set(cols_to_exclude) | {class_col}\n",
        "  # numerical columns, in their original order, minus the excluded ones\n",
        "  cols = [c for c in data.select_dtypes(include=np.number).columns if c not in excluded]\n",
        "  # Plot only the selected columns (the class column is re-added for the hue).\n",
        "  # Passing the unfiltered frame would ignore `cols_to_exclude` entirely.\n",
        "  sns.pairplot(data[cols + [class_col]], hue=class_col)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "mhtOwq2-YADb",
        "outputId": "185665dd-2c1d-4936-a34d-076ea1f2bd2d"
      },
      "outputs": [],
      "source": [
        "# Pairplot of the dataset coloured by churn; the listed columns are passed as exclusions\n",
        "full_plot(\n",
        "    data,\n",
        "    class_col='churn',\n",
        "    cols_to_exclude=['customer_id', 'phone_no', 'year'],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KkQ0mOtRUQOb"
      },
      "outputs": [],
      "source": [
        "# function for creating plots for selective columns only\n",
        "def selected_diagnotic(data,class_col, cols_to_eval):\n",
        "  \"\"\"Draw a seaborn pairplot of the chosen columns of `data`, coloured by `class_col`.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame to plot.\n",
        "    class_col: name of the column used as the `hue` of the pairplot.\n",
        "    cols_to_eval: column names to include in the plot; this list is not modified.\n",
        "  \"\"\"\n",
        "  # Build a new list instead of appending to `cols_to_eval`: the caller's list\n",
        "  # stays intact and the class column can never be selected twice.\n",
        "  cols = [c for c in cols_to_eval if c != class_col] + [class_col]\n",
        "  X = data[cols] # only selective columns\n",
        "  sns.pairplot(X, hue=class_col) # plot"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 374
        },
        "id": "on5q6dJuWqG_",
        "outputId": "32663e8c-deb5-4ba4-8fdf-c1433adedf10"
      },
      "outputs": [],
      "source": [
        "# Pairplot restricted to two columns, coloured by churn\n",
        "selected_diagnotic(\n",
        "    data,\n",
        "    class_col='churn',\n",
        "    cols_to_eval=['videos_watched', 'no_of_days_subscribed'],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9hOwb2lcZjOZ"
      },
      "outputs": [],
      "source": [
        "def logistic_regression(data, class_col, cols_to_exclude, add_intercept=True):\n",
        "  \"\"\"Fit a statsmodels Logit model on the numeric columns of `data` and print its summary.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame with the predictors and the target.\n",
        "    class_col: name of the target column.\n",
        "    cols_to_exclude: numeric columns to leave out of the predictors.\n",
        "    add_intercept: when True (default) a constant column is added to the\n",
        "      predictors, because statsmodels does not add an intercept on its own.\n",
        "      Pass False to fit the model without an intercept.\n",
        "  \"\"\"\n",
        "  cols = data.select_dtypes(include=np.number).columns # numerical columns only\n",
        "  X = data[cols.difference([class_col]).difference(cols_to_exclude)] # drop target and unwanted columns\n",
        "  if add_intercept:\n",
        "    X = sm.add_constant(X) # intercept term for the model\n",
        "\n",
        "  y = data[class_col] # the target variable \n",
        "  logit_model = sm.Logit(y,X) \n",
        "  result = logit_model.fit() # fit the model \n",
        "  print(result.summary2()) # check for summary "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CylJ_cx8lLgS",
        "outputId": "021114d2-813c-4579-aba0-b0112da318e7"
      },
      "outputs": [],
      "source": [
        "# Fit the statsmodels logit on churn and print its summary\n",
        "logistic_regression(\n",
        "    data,\n",
        "    class_col='churn',\n",
        "    cols_to_exclude=['customer_id', 'phone_no', 'year'],\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "H9VkVEB6lTvZ"
      },
      "outputs": [],
      "source": [
        "def prepare_data(data, class_col, cols_to_exclude, test_size=0.3, random_state=0, stratify=False):\n",
        "  \"\"\"Select the numeric predictors of `data` and split them into train and test sets.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame with the predictors and the target.\n",
        "    class_col: name of the target column.\n",
        "    cols_to_exclude: numeric columns to leave out of the predictors.\n",
        "    test_size: share of the rows assigned to the test set (default 0.3).\n",
        "    random_state: seed of the split (default 0).\n",
        "    stratify: when True the split keeps the class proportions of `class_col`\n",
        "      in both sets; the default False leaves the split unstratified.\n",
        "\n",
        "  Returns:\n",
        "    X_train, X_test, y_train, y_test as returned by `train_test_split`.\n",
        "  \"\"\"\n",
        "  ## Selecting only the numerical columns and excluding the columns we specified in the function\n",
        "  cols = data.select_dtypes(include=np.number).columns\n",
        "  X = data[cols.difference([class_col]).difference(cols_to_exclude)]\n",
        "  ## Selecting y as a column\n",
        "  y = data[class_col]\n",
        "  ## Split in training and test set\n",
        "  return train_test_split(\n",
        "      X, y,\n",
        "      test_size=test_size,\n",
        "      random_state=random_state,\n",
        "      stratify=y if stratify else None,\n",
        "  )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HaJzCmF0l6n9"
      },
      "outputs": [],
      "source": [
        "def run_model(X_train, X_test, y_train, y_test):\n",
        "  \"\"\"Fit a logistic regression and report its performance on the test set.\n",
        "\n",
        "  Args:\n",
        "    X_train, y_train: predictors and target used to fit the model.\n",
        "    X_test, y_test: predictors and target used for the evaluation.\n",
        "\n",
        "  Returns:\n",
        "    The class predictions for `X_test`.\n",
        "\n",
        "  Prints the classification report and the ROC AUC on the test set.\n",
        "  \"\"\"\n",
        "  # Fitting the logistic regression\n",
        "  logreg = LogisticRegression(random_state=13)\n",
        "  logreg.fit(X_train, y_train) # fit the model\n",
        "  # Predicting y values\n",
        "  y_pred = logreg.predict(X_test) # make predictions on the test data\n",
        "  # ROC AUC ranks samples by a continuous score, so it is computed from the\n",
        "  # predicted probability of the positive class rather than from hard labels.\n",
        "  y_score = logreg.predict_proba(X_test)[:, 1]\n",
        "  logit_roc_auc = roc_auc_score(y_test, y_score)\n",
        "  print(classification_report(y_test, y_pred)) # check for classification report \n",
        "  print(\"The area under the curve is:\", logit_roc_auc)  # check for AUC\n",
        "  return y_pred"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GsjB3X51m5Fh",
        "outputId": "0dce3d78-e373-42d7-e5ea-f63b6b9991ab"
      },
      "outputs": [],
      "source": [
        "# Baseline: split the data, then fit and evaluate a plain logistic regression\n",
        "X_train, X_test, y_train, y_test = prepare_data(\n",
        "    data,\n",
        "    class_col='churn',\n",
        "    cols_to_exclude=['customer_id', 'phone_no', 'year'],\n",
        ")\n",
        "y_pred = run_model(X_train, X_test, y_train, y_test)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l1foHWuxfpr7"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import confusion_matrix\n",
        "\n",
        "def confusion_m(y_test, y_pred):\n",
        "  \"\"\"Print the confusion matrix of the predictions and its four cells.\n",
        "\n",
        "  Args:\n",
        "    y_test: true labels.\n",
        "    y_pred: predicted labels.\n",
        "\n",
        "  The matrix is unpacked into four values, so it has to be 2x2.\n",
        "  \"\"\"\n",
        "  matrix = confusion_matrix(y_test, y_pred)\n",
        "  print(matrix)\n",
        "  # flattened row by row: true negatives, false positives, false negatives, true positives\n",
        "  true_neg, false_pos, false_neg, true_pos = matrix.ravel()\n",
        "  for label, count in ((\"TN:\", true_neg), (\"TP:\", true_pos), (\"FN:\", false_neg), (\"FP:\", false_pos)):\n",
        "    print(label, count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cipNEx9R9iRE",
        "outputId": "09c7dcb6-3923-46c7-e0dc-3a21721fb343"
      },
      "outputs": [],
      "source": [
        "## Call the function on the test labels and the model predictions\n",
        "confusion_m(y_test=y_test, y_pred=y_pred)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ksRCpBZCng5k"
      },
      "outputs": [],
      "source": [
        "# class imbalance method 1 \n",
        "def run_model_bweights(X_train, X_test, y_train, y_test):\n",
        "    \"\"\"Fit a logistic regression with `class_weight='balanced'` and report test performance.\n",
        "\n",
        "    Args:\n",
        "      X_train, y_train: predictors and target used to fit the model.\n",
        "      X_test, y_test: predictors and target used for the evaluation.\n",
        "\n",
        "    Returns:\n",
        "      The class predictions for `X_test`.\n",
        "\n",
        "    Prints the classification report and the ROC AUC on the test set.\n",
        "    \"\"\"\n",
        "    logreg = LogisticRegression(random_state=13, class_weight='balanced') # define class_weight parameter\n",
        "    logreg.fit(X_train, y_train) # fit the model \n",
        "    y_pred = logreg.predict(X_test) # predict on test data\n",
        "    # ROC AUC ranks samples by a continuous score, so it is computed from the\n",
        "    # predicted probability of the positive class rather than from hard labels.\n",
        "    y_score = logreg.predict_proba(X_test)[:, 1]\n",
        "    logit_roc_auc = roc_auc_score(y_test, y_score) # ROC AUC score\n",
        "    print(classification_report(y_test, y_pred)) \n",
        "    print(\"The area under the curve is:\", logit_roc_auc) # AUC curve\n",
        "    return y_pred"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wAWyH-cBoYow",
        "outputId": "24347c88-d87f-4bc8-fe08-f6314d83bad7"
      },
      "outputs": [],
      "source": [
        "# Evaluate the model with balanced class weights on the current split\n",
        "run_model_bweights(\n",
        "    X_train=X_train,\n",
        "    X_test=X_test,\n",
        "    y_train=y_train,\n",
        "    y_test=y_test,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eHQ5X6-Dobc6"
      },
      "outputs": [],
      "source": [
        "# class imbalance method 2\n",
        "def run_model_aweights(X_train, X_test, y_train, y_test, w):\n",
        "    \"\"\"Fit a logistic regression with user-supplied class weights and report test performance.\n",
        "\n",
        "    Args:\n",
        "      X_train, y_train: predictors and target used to fit the model.\n",
        "      X_test, y_test: predictors and target used for the evaluation.\n",
        "      w: value passed to `LogisticRegression(class_weight=...)`,\n",
        "        e.g. a dict mapping each class label to its weight.\n",
        "\n",
        "    Returns:\n",
        "      The class predictions for `X_test`.\n",
        "\n",
        "    Prints the classification report and the ROC AUC on the test set.\n",
        "    \"\"\"\n",
        "    logreg = LogisticRegression(random_state=13, class_weight=w) # define class_weight parameter\n",
        "    logreg.fit(X_train, y_train) # fit the model \n",
        "    y_pred = logreg.predict(X_test) # predict on test data\n",
        "    # ROC AUC ranks samples by a continuous score, so it is computed from the\n",
        "    # predicted probability of the positive class rather than from hard labels.\n",
        "    y_score = logreg.predict_proba(X_test)[:, 1]\n",
        "    logit_roc_auc = roc_auc_score(y_test, y_score)  # ROC AUC score\n",
        "    print(classification_report(y_test, y_pred))\n",
        "    print(\"The area under the curve is: %0.2f\"%logit_roc_auc)  # AUC curve\n",
        "    return y_pred"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XCU6OwiNxabC",
        "outputId": "5eb4ea9a-d72b-4611-8ab9-c6881bf394a6"
      },
      "outputs": [],
      "source": [
        "# Evaluate the model with manually chosen class weights.\n",
        "# NOTE(review): these weights make errors on class 0 nine times as costly as\n",
        "# errors on class 1. If churn == 1 is the minority class, the weights look\n",
        "# inverted for an imbalance correction -- confirm against the class counts.\n",
        "run_model_aweights(X_train, X_test, y_train, y_test, w={0: 90, 1: 10})"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "PoF1mh1xopI9"
      },
      "outputs": [],
      "source": [
        "# class imbalance method 3\n",
        "def adjust_imbalance(X_train, y_train, class_col):\n",
        "  \"\"\"Resample the class-1 rows of a training set to the number of class-0 rows.\n",
        "\n",
        "  Args:\n",
        "    X_train: training predictors.\n",
        "    y_train: training target; after the column-wise concatenation with\n",
        "      `X_train` it must be reachable under the name `class_col`.\n",
        "    class_col: name of the target column, with classes coded 0 and 1.\n",
        "\n",
        "  Returns:\n",
        "    A DataFrame made of the resampled class-1 rows followed by all class-0 rows.\n",
        "  \"\"\"\n",
        "  train = pd.concat([X_train, y_train], axis=1)\n",
        "  # separate the 2 classes\n",
        "  class1 = train[train[class_col] == 1]\n",
        "  class0 = train[train[class_col] == 0]\n",
        "  # Case 1 - class 1 is the smaller class: bootstrap it (sample with replacement).\n",
        "  # Case 2 - otherwise: draw from class 1 without replacement.\n",
        "  with_replacement = len(class1) < len(class0)\n",
        "  resampled = resample(class1,\n",
        "                       replace=with_replacement,\n",
        "                       n_samples=len(class0), # number of rows to match class 0\n",
        "                       random_state=10)\n",
        "  # combination of the resampled class 1 and the untouched class 0\n",
        "  return pd.concat([resampled, class0])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ztC2PFvPsE70"
      },
      "outputs": [],
      "source": [
        "## Call the function on the training split only\n",
        "resampled_data = adjust_imbalance(\n",
        "    X_train=X_train,\n",
        "    y_train=y_train,\n",
        "    class_col='churn',\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "fTWnG5RBqf7f",
        "outputId": "8adb9d43-543e-4695-ee50-b899941db5f3"
      },
      "outputs": [],
      "source": [
        "# Train on the resampled training set and evaluate on the untouched test split.\n",
        "# Splitting `resampled_data` again would place copies of the same resampled rows\n",
        "# in both the training and the test set and inflate the reported scores.\n",
        "X_train_resampled = resampled_data.drop(columns=['churn'])\n",
        "y_train_resampled = resampled_data['churn']\n",
        "run_model(X_train_resampled, X_test, y_train_resampled, y_test)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zy5_fe3xrt_k"
      },
      "outputs": [],
      "source": [
        "def prepare_data_smote(data,class_col,cols_to_exclude):\n",
        "  \"\"\"Split `data` into train and test sets and oversample the training set with SMOTE.\n",
        "\n",
        "  SMOTE (Synthetic Minority Oversampling Technique) generates new instances\n",
        "  from the existing minority cases supplied as input.\n",
        "\n",
        "  Args:\n",
        "    data: pandas DataFrame with the predictors and the target.\n",
        "    class_col: name of the target column.\n",
        "    cols_to_exclude: numeric columns to leave out of the predictors.\n",
        "\n",
        "  Returns:\n",
        "    X_train, X_test, y_train, y_test; only the training pair is resampled.\n",
        "  \"\"\"\n",
        "  numeric_cols = data.select_dtypes(include=np.number).columns\n",
        "  feature_cols = numeric_cols.difference([class_col]).difference(cols_to_exclude)\n",
        "  features, target = data[feature_cols], data[class_col]\n",
        "  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)\n",
        "  # run SMOTE on training set only\n",
        "  smote = SMOTE(random_state=0, sampling_strategy=1.0)\n",
        "  X_train, y_train = smote.fit_resample(X_train, y_train)\n",
        "  return X_train, X_test, y_train, y_test"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b2N_k-aCs8ck",
        "outputId": "06770f5c-366e-4aad-fd84-475a2c2652b8"
      },
      "outputs": [],
      "source": [
        "# Split the data with SMOTE applied to the training set, then fit and evaluate the model\n",
        "X_train, X_test, y_train, y_test = prepare_data_smote(\n",
        "    data,\n",
        "    class_col='churn',\n",
        "    cols_to_exclude=['customer_id', 'phone_no', 'year'],\n",
        ")\n",
        "run_model(X_train, X_test, y_train, y_test)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "Handling_Imbalance_Datasets_PythonCodeTutorial.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
